This project will use the "global_cancer_patients_2015_2024" dataset from Kaggle, available at: "https://www.kaggle.com/datasets/zahidmughal2343/global-cancer-patients-2015-2024/data"
This project will attempt to discover the severity of cancer as a result of the various factors in the dataset such as age, gender, obesity, etc.
It will use K Nearest Neighbours and a Random Forest and compare the methods.
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import scipy.stats as stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
# For model building
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): scipy, statsmodels, LogisticRegression, cross_val_score,
# roc_auc_score and roc_curve are not used in the code below — confirm
# before removing (this is a regression task, so the ROC imports look stale).
# Suppress warnings (also hides sklearn's DataConversionWarning from fitting
# with a one-column DataFrame as y — see the model-fitting cells below).
import warnings
warnings.filterwarnings('ignore')
# Load the dataset (local copy of the Kaggle CSV).
df = pd.read_csv('data/global_cancer_patients_2015_2024.csv')
# Display basic information about the dataset
df.info()
# Display the first few rows of the dataset
df.head()
# Summary statistics for the numeric columns.
df.describe()
# Can we track a patient through given years?
# Counts rows per Patient_ID: a count > 1 would indicate repeat observations.
df.groupby('Patient_ID')['Gender'].count()
# What's in the different columns (kept for reference; uncomment to re-run)
# for c in df.columns[1:]:
# print(c, df[c].unique())
# Correlation heatmap of the numeric columns (IDs and Year carry no signal).
fig, ax = plt.subplots(figsize=(15, 10))
# numeric_only=True: pandas >= 2.0 raises a TypeError if .corr() is given
# the string columns (Gender, Country_Region, Cancer_Type, Cancer_Stage).
sns.heatmap(df.drop(columns=['Patient_ID', 'Year']).corr(numeric_only=True),
            robust=True, annot=True, ax=ax, fmt=".1f")
# Pairwise scatterplots with KDE diagonals to eyeball joint distributions.
sns.pairplot(df, diag_kind='kde')
# Severity by cancer type, faceted by gender.
g = sns.FacetGrid(df, col="Gender", height=5)
g.map_dataframe(sns.violinplot, y="Target_Severity_Score", x="Cancer_Type")
# Rotate the x tick labels so the cancer-type names remain readable.
# (facet_ax, not ax: avoid clobbering the heatmap axis created above.)
for facet_ax in g.axes.flat:
    for label in facet_ax.get_xticklabels():
        label.set_rotation(45)
The data appears to have been machine generated. There are no correlations between anything except the cancer severity and its predictor variables. Females don't have prostates, so such an even distribution of many women with prostate cancer would be difficult to arise naturally. We can also see the randomness in the scatterplots between the different variables. There are no examples of patients without cancer in this dataset, so the objective is to predict the severity of the cancer rather than its presence.
# Keep only the numeric risk factors plus the target: drop identifiers,
# outcome-adjacent columns (Survival_Years, Treatment_Cost_USD) and the
# categorical columns we are not encoding.
df2 = df.drop(columns=['Patient_ID', 'Year', 'Country_Region', 'Survival_Years', 'Treatment_Cost_USD', 'Gender', 'Cancer_Type'])
# Ordinal-encode the cancer stage: Stage I -> 1 ... Stage IV -> 4.
# Any unmapped stage (e.g. Stage 0) falls back to 0, matching the
# original default value.
stage_map = {'Stage I': 1, 'Stage II': 2, 'Stage III': 3, 'Stage IV': 4}
df2.insert(1, 'Cancer_Stage_no', df2['Cancer_Stage'].map(stage_map).fillna(0).astype(int))
df2 = df2.drop(columns=['Cancer_Stage'])
# 80/20 train/test split. random_state pins the split so the results are
# reproducible; y is a 1-D Series, the shape sklearn regressors expect.
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop(columns=['Target_Severity_Score']),
    df2['Target_Severity_Score'],
    test_size=0.2,
    random_state=42,
)
X_train.head()
Unlike the work done in our homework, this is a regression problem.
# Sweep the neighbour count k for the KNN regressor, recording both
# in-sample (train) and out-of-sample (test) fit quality for each k.
pot_neigh = [1, 2, 3, 4, 5, 10, 20, 50]
r2_scores = []
r2_train_scores = []
MSE_scores = []
MSE_train_scores = []
for k in pot_neigh:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    pred_test = knn.predict(X_test)
    pred_train = knn.predict(X_train)
    r2_scores.append(r2_score(y_test, pred_test))
    r2_train_scores.append(r2_score(y_train, pred_train))
    MSE_scores.append(mean_squared_error(y_test, pred_test))
    MSE_train_scores.append(mean_squared_error(y_train, pred_train))
def plot_knn_metric(test_vals, train_vals, ylabel, title):
    """Plot a train vs. test KNN metric against the number of neighbours.

    test_vals / train_vals are metric values aligned with pot_neigh;
    ylabel and title label the figure. Shows the plot as a side effect.
    """
    plt.plot(pot_neigh, test_vals, label="Test Data", color='blue')
    plt.plot(pot_neigh, train_vals, label="Train Data", color='red')
    # Add labels and title
    plt.xlabel('Number of Neighbours')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc="upper right")
    # Show plot
    plt.show()

# R2 and MSE tell the same story; one helper avoids the duplicated plot code.
plot_knn_metric(r2_scores, r2_train_scores, 'r2', 'impact of Number of Neighbours on R2')
plot_knn_metric(MSE_scores, MSE_train_scores, 'MSE', 'impact of Number of Neighbours on MSE')
Although the training-data MSE and R-squared values both preferred a model with fewer neighbours, that increased the variance of the model and resulted in poorer results in the out-of-sample predictions. This is a clear example of where accepting some bias in exchange for a more robust model is appropriate.
Random Forests are an amalgamation of individual decision trees.
# Hyperparameter search for the random forest: exhaustive grid over tree
# depth and ensemble size, scored by 5-fold cross-validated R2.
RF = RandomForestRegressor()
depthy = [1, 3, 5, 8, 10]
nesty = [10, 50, 100, 200]
parammers = {'max_depth': depthy, 'n_estimators': nesty}
grid_search = GridSearchCV(estimator=RF, param_grid=parammers, cv=5, scoring='r2')
# np.ravel collapses y_train to 1-D, avoiding sklearn's column-vector warning.
grid_search.fit(X_train, np.ravel(y_train))
# Mean CV score for every (max_depth, n_estimators) pair, laid out as a grid.
results = pd.DataFrame(grid_search.cv_results_)
pivot_table = results.pivot_table(
    values='mean_test_score',
    index='param_max_depth',
    columns='param_n_estimators'
)
plt.figure(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, fmt=".3f", cmap='viridis')
plt.title('Grid Search CV Results')
plt.xlabel('n_estimators')
plt.ylabel('max_depth')
plt.show()
# Evaluate the refit best estimator on the held-out test set.
best_model = grid_search.best_estimator_
Y_RF_pred = best_model.predict(X_test)
RF_R2 = r2_score(y_test, Y_RF_pred)
RF_MSE = mean_squared_error(y_test, Y_RF_pred)
print(best_model.get_params())
print("Random forest Best Model R Squared: {:.3f}".format(RF_R2))
print("Random forest Best Model Mean Squared Error: {:.3f}".format(RF_MSE))
# Hand-picked alternative forest (smaller than the CV winner) as a sanity
# check on how sensitive the test score is to the exact hyperparameters.
RF2 = RandomForestRegressor(n_estimators=50, max_depth=8)
alt_model = RF2.fit(X_train, y_train)
Y_RF_pred_alt = alt_model.predict(X_test)
RF_R2_alt = r2_score(y_test, Y_RF_pred_alt)
RF_MSE_alt = mean_squared_error(y_test, Y_RF_pred_alt)
# Report both metrics for the alternative model.
for label, score in (("R Squared", RF_R2_alt), ("Mean Squared Error", RF_MSE_alt)):
    print("Random forest Alt Model {}: {:.3f}".format(label, score))
The random forest gives similar results to the KNN approach; however, it appears to require far more computing power. Unlike with the simplified grid-search approach I applied for KNN, the best model from the random forest search was also the best out-of-sample model. This is likely due to the cross-validation built into grid search, which already accounts for out-of-sample performance.